* Summary statistics for four samples: 2017 cross-section all ages; 
* 2005-2017 panel all ages; 2017 cross-section ages 40 to 55; 
* 2017 cross-section ages 56 to 70
	
	frames reset 
	frame create desc speccat npi median
	use "${clean_data}/panel_physicians_clean.dta", clear

	* Set locals
	encode spec_category_desc, g(spec_category_num)
	levelsof spec_category_desc, loc(speclab)
	loc specno = `r(r)'
	loc varlist "age female is_foreign married observed_in_acs totw2comp ptotinc aginc with25Kbizinc top1 einsize wkh retired_1099ssa observe_mdsch ranked_school top5_school"
	
	cap log close
	log using "${output}/descriptives/desc01-sum_stats.txt", replace text 
	
	* Loop over samples
	foreach i in 1 2 3 4 {
		if `i' == 1	gl restriction = ""
		if `i' == 2	gl restriction = "if year==2017"
		if `i' == 3	gl restriction = "if year==2017 & inrange(age, 40, 55)"
		if `i' == 4	gl restriction = "if year==2017 & inrange(age, 56, 70)"
		
		* Panel A: Summarize variables for all four samples 
		gstats tab `varlist' $restriction, s(mean sd count) pretty matasave(desc`i')
		mata: st_matrix("panelA`i'", desc`i'.output)
		
			// Add median
			mat median = J(`: word count `varlist'',2,.)
			loc j = 1
			foreach var of varl `varlist' {
			preserve
				if `i'> 1 {
					keep $restriction
				}
				
				qui centile `var', centile(49.99 50.01)
				qui su `var' if inrange(`var',`r(c_1)',`r(c_2)')
				if `r(N)' < 11 {
					di "Less than 11 observations between 49.99th and 50.01st percentile, use 49.9th and 50.1st percentile instead"
					qui centile `var', centile(49.90 50.1)
					qui su `var' if inrange(`var',`r(c_1)',`r(c_2)')
					assert `r(N)' > 10
			
				}
				
				mat median[`j',1] = `r(mean)'
				mat median[`j',2] = `r(N)'
				
			loc ++j 	
			restore
			}

		* Panel B: Tabulate specialty categories for all four samples
		tab spec_category_num $restriction , m matcell(tmp`i')
		mata: tmp`i' = st_matrix("tmp`i'")
		mata: st_matrix("panelB`i'", tmp`i')

		* Panel C: Counts of unique NPIs
		gunique npi $restriction
		loc unique = `r(unique)'
		loc N = `r(N)'
	
***	
	log off
		
		* Save matrices in frames
		preserve 
			// Panel A  
			#d ;
			clear; svmat panelA`i' ; g depno = _n ; g depvar = "" ; g panel = "A" ; ren (panelA`i'1 panelA`i'2 panelA`i'3) (mean stdev N_A_mean) ; 
			g sample = subinstr(subinstr(subinstr(subinstr("$restriction","if ","",.),"inrange(age,","ages",.),", ","-",.),")","",.) ;
			#d cr
			
			g median = . 
			g N_A_med = .
			
			foreach j of numl 1(1)`: word count `varlist'' {
				replace median = median[`j',1] if _n == `j'
				replace N_A_med = median[`j',2] if _n == `j'
			}
			tempfile panelA`i'
			save `panelA`i''
			
			// Panel B
			#d ;
			clear; svmat panelB`i' ; g depno = 100+_n ; g spec_category_num = _n ; g depvar = "spec_category_desc: " ; g panel = "B" ; ren (panelB`i'1) (N_B) ; 
			g sample = subinstr(subinstr(subinstr(subinstr("$restriction","if ","",.),"inrange(age,","ages",.),", ","-",.),")","",.) ;
			#d cr
			tempfile panelB`i'
			save `panelB`i''
			
			// Panel C
			#d ;
			clear; set obs 1 ; g depno = 200+_n ; g depvar = "no. observations" ; g no_unique_npi = `unique' ; g N_phys_year = `N' ; g panel = "C" ; 
			g sample = subinstr(subinstr(subinstr(subinstr("$restriction","if ","",.),"inrange(age,","ages",.),", ","-",.),")","",.) ;
			#d cr 
			tempfile panelC`i'
			save `panelC`i''
		restore
		
		frame desc {
			append using `panelA`i''
			append using `panelB`i''
			append using `panelC`i''
		} 
***
	log on 
	}
***
	log close
	cap macro drop restriction

	* Format data 
	frame change desc

	replace sample = "All" if sample == ""
	levelsof depno, loc(levels)
	foreach level in `levels' {
		replace depvar = "`:word `level' of `varlist''" if depno == `level' & depvar == ""
	}
		
	foreach level of numl 1(1)`specno' {
		replace depvar = depvar + "`:word `level' of `speclab''" if depvar == "spec_category_desc: " & spec_category_num == `level'
	}
	replace depvar = "Missing" if depvar == "spec_category_desc: "
	drop speccat npi spec_category_num
	
	* Compute shares, rounded counts and collect unrounded N
	g N_UR_Shares = N_B
	g N_UR_Mean_SD = N_A_mean
	g N_UR_Median = N_A_med
	g N_UR_unique_npi = no_unique_npi
	g N_UR_phys_years = N_phys_year
	ren N_A_mean count
	
	* Output 
	bys sample: egen aux = max(N_phys_year)
	g share_sample = N_B/aux if panel == "B"
	drop aux N_B N_A_med
	drbest_docinc share_sample mean median stdev
	drbcount count no_unique_npi N_phys_year, replace
	order panel depno depvar sample count no* N_phys* mean median stdev share* N_UR*
	export delimited using "$mypath/intermediate_csv/desc01-sum_stats.csv", replace dataf delim(tab)  
	
